# Core imports for the wine-quality notebook.
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
# NOTE(review): `metrics` and `GaussianNB` are imported but never used in this
# chunk — confirm whether a Naive Bayes section exists elsewhere in the notebook.
# Load the Wine Quality dataset (WineQT.csv expected in the working directory).
wine_df= pd.read_csv("WineQT.csv")
# Preview the first five rows.
wine_df.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 0 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 | 1 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 | 2 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 | 3 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 4 |
# Per-column summary statistics (count, mean, std, min, quartiles, max).
wine_df.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 |
| mean | 8.311111 | 0.531339 | 0.268364 | 2.532152 | 0.086933 | 15.615486 | 45.914698 | 0.996730 | 3.311015 | 0.657708 | 10.442111 | 5.657043 | 804.969379 |
| std | 1.747595 | 0.179633 | 0.196686 | 1.355917 | 0.047267 | 10.250486 | 32.782130 | 0.001925 | 0.156664 | 0.170399 | 1.082196 | 0.805824 | 463.997116 |
| min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 | 0.000000 |
| 25% | 7.100000 | 0.392500 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 21.000000 | 0.995570 | 3.205000 | 0.550000 | 9.500000 | 5.000000 | 411.000000 |
| 50% | 7.900000 | 0.520000 | 0.250000 | 2.200000 | 0.079000 | 13.000000 | 37.000000 | 0.996680 | 3.310000 | 0.620000 | 10.200000 | 6.000000 | 794.000000 |
| 75% | 9.100000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 61.000000 | 0.997845 | 3.400000 | 0.730000 | 11.100000 | 6.000000 | 1209.500000 |
| max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 68.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 | 1597.000000 |
# Same summary statistics, transposed so each feature is a row.
wine_df.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| fixed acidity | 1143.0 | 8.311111 | 1.747595 | 4.60000 | 7.10000 | 7.90000 | 9.100000 | 15.90000 |
| volatile acidity | 1143.0 | 0.531339 | 0.179633 | 0.12000 | 0.39250 | 0.52000 | 0.640000 | 1.58000 |
| citric acid | 1143.0 | 0.268364 | 0.196686 | 0.00000 | 0.09000 | 0.25000 | 0.420000 | 1.00000 |
| residual sugar | 1143.0 | 2.532152 | 1.355917 | 0.90000 | 1.90000 | 2.20000 | 2.600000 | 15.50000 |
| chlorides | 1143.0 | 0.086933 | 0.047267 | 0.01200 | 0.07000 | 0.07900 | 0.090000 | 0.61100 |
| free sulfur dioxide | 1143.0 | 15.615486 | 10.250486 | 1.00000 | 7.00000 | 13.00000 | 21.000000 | 68.00000 |
| total sulfur dioxide | 1143.0 | 45.914698 | 32.782130 | 6.00000 | 21.00000 | 37.00000 | 61.000000 | 289.00000 |
| density | 1143.0 | 0.996730 | 0.001925 | 0.99007 | 0.99557 | 0.99668 | 0.997845 | 1.00369 |
| pH | 1143.0 | 3.311015 | 0.156664 | 2.74000 | 3.20500 | 3.31000 | 3.400000 | 4.01000 |
| sulphates | 1143.0 | 0.657708 | 0.170399 | 0.33000 | 0.55000 | 0.62000 | 0.730000 | 2.00000 |
| alcohol | 1143.0 | 10.442111 | 1.082196 | 8.40000 | 9.50000 | 10.20000 | 11.100000 | 14.90000 |
| quality | 1143.0 | 5.657043 | 0.805824 | 3.00000 | 5.00000 | 6.00000 | 6.000000 | 8.00000 |
| Id | 1143.0 | 804.969379 | 463.997116 | 0.00000 | 411.00000 | 794.00000 | 1209.500000 | 1597.00000 |
%pip install seaborn matplotlib
Requirement already satisfied: seaborn in c:\users\amrendra mishra\anaconda3\lib\site-packages (0.12.2) Requirement already satisfied: matplotlib in c:\users\amrendra mishra\anaconda3\lib\site-packages (3.7.2) Requirement already satisfied: numpy!=1.24.0,>=1.17 in c:\users\amrendra mishra\anaconda3\lib\site-packages (from seaborn) (1.24.3) Requirement already satisfied: pandas>=0.25 in c:\users\amrendra mishra\anaconda3\lib\site-packages (from seaborn) (2.0.3) Requirement already satisfied: contourpy>=1.0.1 in c:\users\amrendra mishra\anaconda3\lib\site-packages (from matplotlib) (1.0.5) Requirement already satisfied: cycler>=0.10 in c:\users\amrendra mishra\anaconda3\lib\site-packages (from matplotlib) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\amrendra mishra\anaconda3\lib\site-packages (from matplotlib) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\amrendra mishra\anaconda3\lib\site-packages (from matplotlib) (1.4.4) Requirement already satisfied: packaging>=20.0 in c:\users\amrendra mishra\anaconda3\lib\site-packages (from matplotlib) (23.1) Requirement already satisfied: pillow>=6.2.0 in c:\users\amrendra mishra\anaconda3\lib\site-packages (from matplotlib) (9.4.0) Requirement already satisfied: pyparsing<3.1,>=2.3.1 in c:\users\amrendra mishra\anaconda3\lib\site-packages (from matplotlib) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\users\amrendra mishra\anaconda3\lib\site-packages (from matplotlib) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\amrendra mishra\anaconda3\lib\site-packages (from pandas>=0.25->seaborn) (2023.3.post1) Requirement already satisfied: tzdata>=2022.1 in c:\users\amrendra mishra\anaconda3\lib\site-packages (from pandas>=0.25->seaborn) (2023.3) Requirement already satisfied: six>=1.5 in c:\users\amrendra mishra\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0) Note: you may need to restart the kernel to use updated 
packages.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the wine-quality CSV into a DataFrame.
data = pd.read_csv("WineQT.csv")

# Render a grid of pairwise scatter plots (with per-column histograms on the
# diagonal) covering every numeric column, then show the figure.
pair_grid = sns.pairplot(data)
plt.show()
C:\Users\Amrendra Mishra\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the dataset
data = pd.read_csv("WineQT.csv")
# Separate features (X) and target variable (y).
# Target for this regression experiment: 'free sulfur dioxide'.
# NOTE(review): the 'Id' column remains in X and acts as a pseudo-feature —
# confirm this is intended.
X = data.drop(columns=['free sulfur dioxide']) # features: every column except the target
y = data['free sulfur dioxide'] # target vector
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# test_size=0.2 holds out 20% of the rows for testing.
# random_state=42 fixes the shuffle so the split is reproducible.
# X_train/y_train: training features and target.
# X_test/y_test: held-out test features and target.
# Display the feature matrix (target column dropped).
X
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 0 |
| 1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 5 | 1 |
| 2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 5 | 2 |
| 3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 6 | 3 |
| 4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1138 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 6 | 1592 |
| 1139 | 6.8 | 0.620 | 0.08 | 1.9 | 0.068 | 38.0 | 0.99651 | 3.42 | 0.82 | 9.5 | 6 | 1593 |
| 1140 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 | 5 | 1594 |
| 1141 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 | 6 | 1595 |
| 1142 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 | 5 | 1597 |
1143 rows × 12 columns
# Display the target Series ('free sulfur dioxide').
y
0 11.0
1 25.0
2 15.0
3 17.0
4 11.0
...
1138 29.0
1139 28.0
1140 32.0
1141 39.0
1142 32.0
Name: free sulfur dioxide, Length: 1143, dtype: float64
# Display the training features (80% of rows).
X_train
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12 | 8.5 | 0.28 | 0.56 | 1.8 | 0.092 | 103.0 | 0.99690 | 3.30 | 0.75 | 10.5 | 7 | 16 |
| 758 | 9.9 | 0.32 | 0.56 | 2.0 | 0.073 | 8.0 | 0.99534 | 3.15 | 0.73 | 11.4 | 6 | 1076 |
| 636 | 8.9 | 0.31 | 0.36 | 2.6 | 0.056 | 39.0 | 0.99562 | 3.40 | 0.69 | 11.8 | 5 | 900 |
| 1109 | 6.6 | 0.88 | 0.04 | 2.2 | 0.066 | 20.0 | 0.99636 | 3.53 | 0.56 | 9.9 | 5 | 1556 |
| 743 | 7.6 | 0.42 | 0.25 | 3.9 | 0.104 | 90.0 | 0.99784 | 3.15 | 0.57 | 9.1 | 5 | 1057 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1044 | 6.7 | 1.04 | 0.08 | 2.3 | 0.067 | 32.0 | 0.99648 | 3.52 | 0.57 | 11.0 | 4 | 1467 |
| 1095 | 8.0 | 0.39 | 0.30 | 1.9 | 0.074 | 84.0 | 0.99717 | 3.39 | 0.61 | 9.0 | 5 | 1533 |
| 1130 | 7.4 | 0.35 | 0.33 | 2.4 | 0.068 | 26.0 | 0.99470 | 3.36 | 0.60 | 11.9 | 6 | 1580 |
| 860 | 7.9 | 0.57 | 0.31 | 2.0 | 0.079 | 79.0 | 0.99677 | 3.29 | 0.69 | 9.5 | 6 | 1216 |
| 1126 | 7.5 | 0.52 | 0.40 | 2.2 | 0.060 | 20.0 | 0.99474 | 3.26 | 0.64 | 11.8 | 6 | 1575 |
914 rows × 12 columns
# Display the held-out test features (20% of rows).
X_test
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 158 | 6.8 | 0.610 | 0.04 | 1.5 | 0.057 | 10.0 | 0.99525 | 3.42 | 0.60 | 9.500000 | 5 | 222 |
| 1081 | 6.9 | 0.840 | 0.21 | 4.1 | 0.074 | 65.0 | 0.99842 | 3.53 | 0.72 | 9.233333 | 6 | 1514 |
| 291 | 7.0 | 0.580 | 0.12 | 1.9 | 0.091 | 124.0 | 0.99560 | 3.44 | 0.48 | 10.500000 | 5 | 417 |
| 538 | 7.8 | 0.480 | 0.68 | 1.7 | 0.415 | 32.0 | 0.99656 | 3.09 | 1.06 | 9.100000 | 6 | 754 |
| 367 | 12.5 | 0.600 | 0.49 | 4.3 | 0.100 | 14.0 | 1.00100 | 3.25 | 0.74 | 11.900000 | 6 | 516 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 66 | 5.0 | 1.020 | 0.04 | 1.4 | 0.045 | 85.0 | 0.99380 | 3.75 | 0.48 | 10.500000 | 4 | 94 |
| 328 | 10.3 | 0.500 | 0.42 | 2.0 | 0.069 | 51.0 | 0.99820 | 3.16 | 0.72 | 11.500000 | 6 | 466 |
| 67 | 6.8 | 0.775 | 0.00 | 3.0 | 0.102 | 23.0 | 0.99650 | 3.45 | 0.56 | 10.700000 | 5 | 96 |
| 231 | 10.0 | 0.490 | 0.20 | 11.0 | 0.071 | 50.0 | 1.00150 | 3.16 | 0.69 | 9.200000 | 6 | 325 |
| 966 | 11.6 | 0.475 | 0.40 | 1.4 | 0.091 | 28.0 | 0.99704 | 3.07 | 0.65 | 10.033333 | 6 | 1359 |
229 rows × 12 columns
# Display the training targets.
y_train
12 35.0
758 3.0
636 10.0
1109 12.0
743 28.0
...
1044 19.0
1095 32.0
1130 9.0
860 10.0
1126 12.0
Name: free sulfur dioxide, Length: 914, dtype: float64
# Display the test targets.
y_test
158 5.0
1081 16.0
291 34.0
538 14.0
367 5.0
...
66 41.0
328 21.0
67 8.0
231 13.0
966 6.0
Name: free sulfur dioxide, Length: 229, dtype: float64
# Display all four split pieces as a tuple.
X_train, X_test, y_train, y_test
( fixed acidity volatile acidity citric acid residual sugar chlorides \
12 8.5 0.28 0.56 1.8 0.092
758 9.9 0.32 0.56 2.0 0.073
636 8.9 0.31 0.36 2.6 0.056
1109 6.6 0.88 0.04 2.2 0.066
743 7.6 0.42 0.25 3.9 0.104
... ... ... ... ... ...
1044 6.7 1.04 0.08 2.3 0.067
1095 8.0 0.39 0.30 1.9 0.074
1130 7.4 0.35 0.33 2.4 0.068
860 7.9 0.57 0.31 2.0 0.079
1126 7.5 0.52 0.40 2.2 0.060
total sulfur dioxide density pH sulphates alcohol quality Id
12 103.0 0.99690 3.30 0.75 10.5 7 16
758 8.0 0.99534 3.15 0.73 11.4 6 1076
636 39.0 0.99562 3.40 0.69 11.8 5 900
1109 20.0 0.99636 3.53 0.56 9.9 5 1556
743 90.0 0.99784 3.15 0.57 9.1 5 1057
... ... ... ... ... ... ... ...
1044 32.0 0.99648 3.52 0.57 11.0 4 1467
1095 84.0 0.99717 3.39 0.61 9.0 5 1533
1130 26.0 0.99470 3.36 0.60 11.9 6 1580
860 79.0 0.99677 3.29 0.69 9.5 6 1216
1126 20.0 0.99474 3.26 0.64 11.8 6 1575
[914 rows x 12 columns],
fixed acidity volatile acidity citric acid residual sugar chlorides \
158 6.8 0.610 0.04 1.5 0.057
1081 6.9 0.840 0.21 4.1 0.074
291 7.0 0.580 0.12 1.9 0.091
538 7.8 0.480 0.68 1.7 0.415
367 12.5 0.600 0.49 4.3 0.100
... ... ... ... ... ...
66 5.0 1.020 0.04 1.4 0.045
328 10.3 0.500 0.42 2.0 0.069
67 6.8 0.775 0.00 3.0 0.102
231 10.0 0.490 0.20 11.0 0.071
966 11.6 0.475 0.40 1.4 0.091
total sulfur dioxide density pH sulphates alcohol quality Id
158 10.0 0.99525 3.42 0.60 9.500000 5 222
1081 65.0 0.99842 3.53 0.72 9.233333 6 1514
291 124.0 0.99560 3.44 0.48 10.500000 5 417
538 32.0 0.99656 3.09 1.06 9.100000 6 754
367 14.0 1.00100 3.25 0.74 11.900000 6 516
... ... ... ... ... ... ... ...
66 85.0 0.99380 3.75 0.48 10.500000 4 94
328 51.0 0.99820 3.16 0.72 11.500000 6 466
67 23.0 0.99650 3.45 0.56 10.700000 5 96
231 50.0 1.00150 3.16 0.69 9.200000 6 325
966 28.0 0.99704 3.07 0.65 10.033333 6 1359
[229 rows x 12 columns],
12 35.0
758 3.0
636 10.0
1109 12.0
743 28.0
...
1044 19.0
1095 32.0
1130 9.0
860 10.0
1126 12.0
Name: free sulfur dioxide, Length: 914, dtype: float64,
158 5.0
1081 16.0
291 34.0
538 14.0
367 5.0
...
66 41.0
328 21.0
67 8.0
231 13.0
966 6.0
Name: free sulfur dioxide, Length: 229, dtype: float64)
# Preview the first five rows of the feature matrix.
X.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 0 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 | 1 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 | 2 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 | 3 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | 4 |
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Load the dataset
data = pd.read_csv("WineQT.csv")
# Display the full DataFrame for inspection.
data
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 0 |
| 1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 5 | 1 |
| 2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 5 | 2 |
| 3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 6 | 3 |
| 4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1138 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 6 | 1592 |
| 1139 | 6.8 | 0.620 | 0.08 | 1.9 | 0.068 | 28.0 | 38.0 | 0.99651 | 3.42 | 0.82 | 9.5 | 6 | 1593 |
| 1140 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 | 5 | 1594 |
| 1141 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 | 6 | 1595 |
| 1142 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 | 5 | 1597 |
1143 rows × 13 columns
# Separate features (X) and target variable (y).
# Target for this regression experiment: 'residual sugar'.
X = data.drop(columns=['residual sugar'])
y = data['residual sugar']
# Display the feature matrix.
X
| fixed acidity | volatile acidity | citric acid | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.700 | 0.00 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 0 |
| 1 | 7.8 | 0.880 | 0.00 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 5 | 1 |
| 2 | 7.8 | 0.760 | 0.04 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 5 | 2 |
| 3 | 11.2 | 0.280 | 0.56 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 6 | 3 |
| 4 | 7.4 | 0.700 | 0.00 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1138 | 6.3 | 0.510 | 0.13 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 6 | 1592 |
| 1139 | 6.8 | 0.620 | 0.08 | 0.068 | 28.0 | 38.0 | 0.99651 | 3.42 | 0.82 | 9.5 | 6 | 1593 |
| 1140 | 6.2 | 0.600 | 0.08 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 | 5 | 1594 |
| 1141 | 5.9 | 0.550 | 0.10 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 | 6 | 1595 |
| 1142 | 5.9 | 0.645 | 0.12 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 | 5 | 1597 |
1143 rows × 12 columns
# Display the target Series ('residual sugar').
y
0 1.9
1 2.6
2 2.3
3 1.9
4 1.9
...
1138 2.3
1139 1.9
1140 2.0
1141 2.2
1142 2.0
Name: residual sugar, Length: 1143, dtype: float64
# Split the data into training and testing sets.
# 80/20 split; random_state=42 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Display the training features.
X_train
| fixed acidity | volatile acidity | citric acid | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12 | 8.5 | 0.28 | 0.56 | 0.092 | 35.0 | 103.0 | 0.99690 | 3.30 | 0.75 | 10.5 | 7 | 16 |
| 758 | 9.9 | 0.32 | 0.56 | 0.073 | 3.0 | 8.0 | 0.99534 | 3.15 | 0.73 | 11.4 | 6 | 1076 |
| 636 | 8.9 | 0.31 | 0.36 | 0.056 | 10.0 | 39.0 | 0.99562 | 3.40 | 0.69 | 11.8 | 5 | 900 |
| 1109 | 6.6 | 0.88 | 0.04 | 0.066 | 12.0 | 20.0 | 0.99636 | 3.53 | 0.56 | 9.9 | 5 | 1556 |
| 743 | 7.6 | 0.42 | 0.25 | 0.104 | 28.0 | 90.0 | 0.99784 | 3.15 | 0.57 | 9.1 | 5 | 1057 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1044 | 6.7 | 1.04 | 0.08 | 0.067 | 19.0 | 32.0 | 0.99648 | 3.52 | 0.57 | 11.0 | 4 | 1467 |
| 1095 | 8.0 | 0.39 | 0.30 | 0.074 | 32.0 | 84.0 | 0.99717 | 3.39 | 0.61 | 9.0 | 5 | 1533 |
| 1130 | 7.4 | 0.35 | 0.33 | 0.068 | 9.0 | 26.0 | 0.99470 | 3.36 | 0.60 | 11.9 | 6 | 1580 |
| 860 | 7.9 | 0.57 | 0.31 | 0.079 | 10.0 | 79.0 | 0.99677 | 3.29 | 0.69 | 9.5 | 6 | 1216 |
| 1126 | 7.5 | 0.52 | 0.40 | 0.060 | 12.0 | 20.0 | 0.99474 | 3.26 | 0.64 | 11.8 | 6 | 1575 |
914 rows × 12 columns
# Display the held-out test features.
X_test
| fixed acidity | volatile acidity | citric acid | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 158 | 6.8 | 0.610 | 0.04 | 0.057 | 5.0 | 10.0 | 0.99525 | 3.42 | 0.60 | 9.500000 | 5 | 222 |
| 1081 | 6.9 | 0.840 | 0.21 | 0.074 | 16.0 | 65.0 | 0.99842 | 3.53 | 0.72 | 9.233333 | 6 | 1514 |
| 291 | 7.0 | 0.580 | 0.12 | 0.091 | 34.0 | 124.0 | 0.99560 | 3.44 | 0.48 | 10.500000 | 5 | 417 |
| 538 | 7.8 | 0.480 | 0.68 | 0.415 | 14.0 | 32.0 | 0.99656 | 3.09 | 1.06 | 9.100000 | 6 | 754 |
| 367 | 12.5 | 0.600 | 0.49 | 0.100 | 5.0 | 14.0 | 1.00100 | 3.25 | 0.74 | 11.900000 | 6 | 516 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 66 | 5.0 | 1.020 | 0.04 | 0.045 | 41.0 | 85.0 | 0.99380 | 3.75 | 0.48 | 10.500000 | 4 | 94 |
| 328 | 10.3 | 0.500 | 0.42 | 0.069 | 21.0 | 51.0 | 0.99820 | 3.16 | 0.72 | 11.500000 | 6 | 466 |
| 67 | 6.8 | 0.775 | 0.00 | 0.102 | 8.0 | 23.0 | 0.99650 | 3.45 | 0.56 | 10.700000 | 5 | 96 |
| 231 | 10.0 | 0.490 | 0.20 | 0.071 | 13.0 | 50.0 | 1.00150 | 3.16 | 0.69 | 9.200000 | 6 | 325 |
| 966 | 11.6 | 0.475 | 0.40 | 0.091 | 6.0 | 28.0 | 0.99704 | 3.07 | 0.65 | 10.033333 | 6 | 1359 |
229 rows × 12 columns
# Display the training targets.
y_train
12 1.8
758 2.0
636 2.6
1109 2.2
743 3.9
...
1044 2.3
1095 1.9
1130 2.4
860 2.0
1126 2.2
Name: residual sugar, Length: 914, dtype: float64
# Display the test targets.
y_test
158 1.5
1081 4.1
291 1.9
538 1.7
367 4.3
...
66 1.4
328 2.0
67 3.0
231 11.0
966 1.4
Name: residual sugar, Length: 229, dtype: float64
# Display all four split pieces as a tuple.
X_train, X_test, y_train, y_test
( fixed acidity volatile acidity citric acid chlorides \
12 8.5 0.28 0.56 0.092
758 9.9 0.32 0.56 0.073
636 8.9 0.31 0.36 0.056
1109 6.6 0.88 0.04 0.066
743 7.6 0.42 0.25 0.104
... ... ... ... ...
1044 6.7 1.04 0.08 0.067
1095 8.0 0.39 0.30 0.074
1130 7.4 0.35 0.33 0.068
860 7.9 0.57 0.31 0.079
1126 7.5 0.52 0.40 0.060
free sulfur dioxide total sulfur dioxide density pH sulphates \
12 35.0 103.0 0.99690 3.30 0.75
758 3.0 8.0 0.99534 3.15 0.73
636 10.0 39.0 0.99562 3.40 0.69
1109 12.0 20.0 0.99636 3.53 0.56
743 28.0 90.0 0.99784 3.15 0.57
... ... ... ... ... ...
1044 19.0 32.0 0.99648 3.52 0.57
1095 32.0 84.0 0.99717 3.39 0.61
1130 9.0 26.0 0.99470 3.36 0.60
860 10.0 79.0 0.99677 3.29 0.69
1126 12.0 20.0 0.99474 3.26 0.64
alcohol quality Id
12 10.5 7 16
758 11.4 6 1076
636 11.8 5 900
1109 9.9 5 1556
743 9.1 5 1057
... ... ... ...
1044 11.0 4 1467
1095 9.0 5 1533
1130 11.9 6 1580
860 9.5 6 1216
1126 11.8 6 1575
[914 rows x 12 columns],
fixed acidity volatile acidity citric acid chlorides \
158 6.8 0.610 0.04 0.057
1081 6.9 0.840 0.21 0.074
291 7.0 0.580 0.12 0.091
538 7.8 0.480 0.68 0.415
367 12.5 0.600 0.49 0.100
... ... ... ... ...
66 5.0 1.020 0.04 0.045
328 10.3 0.500 0.42 0.069
67 6.8 0.775 0.00 0.102
231 10.0 0.490 0.20 0.071
966 11.6 0.475 0.40 0.091
free sulfur dioxide total sulfur dioxide density pH sulphates \
158 5.0 10.0 0.99525 3.42 0.60
1081 16.0 65.0 0.99842 3.53 0.72
291 34.0 124.0 0.99560 3.44 0.48
538 14.0 32.0 0.99656 3.09 1.06
367 5.0 14.0 1.00100 3.25 0.74
... ... ... ... ... ...
66 41.0 85.0 0.99380 3.75 0.48
328 21.0 51.0 0.99820 3.16 0.72
67 8.0 23.0 0.99650 3.45 0.56
231 13.0 50.0 1.00150 3.16 0.69
966 6.0 28.0 0.99704 3.07 0.65
alcohol quality Id
158 9.500000 5 222
1081 9.233333 6 1514
291 10.500000 5 417
538 9.100000 6 754
367 11.900000 6 516
... ... ... ...
66 10.500000 4 94
328 11.500000 6 466
67 10.700000 5 96
231 9.200000 6 325
966 10.033333 6 1359
[229 rows x 12 columns],
12 1.8
758 2.0
636 2.6
1109 2.2
743 3.9
...
1044 2.3
1095 1.9
1130 2.4
860 2.0
1126 2.2
Name: residual sugar, Length: 914, dtype: float64,
158 1.5
1081 4.1
291 1.9
538 1.7
367 4.3
...
66 1.4
328 2.0
67 3.0
231 11.0
966 1.4
Name: residual sugar, Length: 229, dtype: float64)
# Create and train the Linear Regression model.
# NOTE(review): the features still include the 'Id' column, which can leak
# row-ordering information into the fit — confirm this is intended.
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
# Display the fitted estimator's repr.
model
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
# Predict on the testing set.
y_pred = model.predict(X_test)
# Display the predicted 'residual sugar' values.
y_pred
array([0.85541521, 2.92111251, 2.63536686, 1.92619682, 4.331679 ,
1.75598103, 1.94732319, 2.36361949, 2.91919823, 3.23236467,
5.15313611, 2.45247144, 3.71430601, 1.99506139, 1.31946845,
3.34649716, 2.17391224, 2.47120965, 1.506877 , 2.79426165,
2.20532127, 4.70751107, 1.55147743, 3.22462835, 2.71968141,
2.42767196, 1.43147484, 2.40167697, 1.88174965, 2.00613936,
0.99549929, 2.67300205, 2.31174015, 3.27338509, 3.85049301,
2.49072281, 4.09469182, 1.95046287, 1.43350157, 3.03271183,
1.91064408, 2.23091637, 1.91110204, 1.7637472 , 2.75759394,
1.81628855, 2.64938726, 3.94136753, 2.55747901, 2.62551073,
2.73428007, 1.16220816, 2.1234848 , 3.62221452, 2.16403308,
2.80646819, 3.48760372, 3.46626603, 1.38440336, 2.21042875,
3.5384896 , 2.16190474, 2.86238898, 1.51511526, 2.29287929,
2.34895994, 2.56619979, 1.79924575, 1.8999282 , 3.29620955,
1.63179561, 2.27730585, 3.57597294, 2.57413658, 0.79618097,
2.95683765, 3.71201254, 2.18358794, 2.03938304, 2.70395737,
2.56943907, 1.74233346, 1.66961944, 2.17169633, 2.4947307 ,
3.24701787, 1.59514234, 2.5149015 , 2.52151378, 2.09963638,
1.62754687, 3.54913462, 1.49919553, 2.96006375, 2.55367904,
2.6988049 , 2.2181423 , 1.36207501, 5.69745192, 2.17364486,
1.62983294, 0.57313497, 3.29837452, 2.6048344 , 0.93686778,
2.53168973, 2.69875086, 4.30194818, 2.39567333, 2.60893018,
2.68118207, 3.82093662, 2.76493366, 1.9390335 , 2.79788398,
3.90940035, 3.24328196, 4.96162822, 2.04815173, 3.41905473,
2.10018356, 1.96929527, 3.78063351, 2.19470648, 1.65359734,
2.39730946, 2.99705226, 2.27067671, 6.74553205, 5.55676218,
3.04821391, 5.7208383 , 4.02036856, 2.40655335, 3.13728661,
1.68318852, 1.56006356, 1.90094849, 2.40072525, 2.00165814,
2.8853239 , 3.6623974 , 2.03207865, 2.34344196, 4.88695374,
2.37672362, 2.65805255, 2.19605287, 1.2830147 , 2.34856352,
1.75110229, 2.459089 , 1.57307914, 4.47397795, 4.47888361,
3.71105471, 2.19609707, 3.04600698, 3.21016267, 1.56594803,
3.16449441, 2.65654139, 2.17192873, 1.81554657, 3.44116137,
1.46323177, 2.16778798, 1.91420767, 2.07087609, 3.70442977,
1.87853691, 2.13720793, 1.92701549, 3.40430645, 2.26611668,
2.66917264, 1.33530893, 4.08564501, 3.5392085 , 2.60315255,
1.46113111, 1.81873912, 2.93226707, 1.98074507, 3.09248022,
1.97618361, 1.3009186 , 2.72756022, 1.94760998, 0.86886414,
2.13950893, 2.79460457, 2.22130922, 1.9196715 , 2.44655563,
2.13907611, 2.66854952, 2.8499266 , 1.70700774, 3.24633934,
3.86595139, 2.77518389, 3.27114474, 2.34880886, 1.56091779,
4.41940545, 2.79219165, 2.89449021, 1.8158406 , 1.78913024,
1.28541397, 2.97242352, 2.92343271, 2.99734577, 2.61700827,
2.65152295, 5.69172007, 1.92005135, 2.3799246 , 2.28626639,
3.43671122, 1.47179666, 2.52798902, 2.531343 , 1.20638151,
3.92795849, 2.8448967 , 4.71854193, 1.0577258 ])
# Evaluate the model: mean squared error between held-out targets and predictions.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
Mean Squared Error: 1.2503677493687742
# Display the test-set mean squared error.
mse
1.2503677493687742
# K-FOLD CROSS VALIDATION
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the dataset.
# FIX: use the same file-name casing as the earlier cells ("WineQT.csv");
# the lowercase "wineQT.csv" only resolves on case-insensitive filesystems
# (e.g. Windows) and fails with FileNotFoundError on Linux/macOS.
wine_data = pd.read_csv("WineQT.csv")

# Separate features (X) and target variable (y); 'quality' is the target here.
X = wine_data.drop(columns=['quality'])
y = wine_data['quality']

# Display the feature matrix for inspection.
X
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | Id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 0 |
| 1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 1 |
| 2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 2 |
| 3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 3 |
| 4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1138 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 1592 |
| 1139 | 6.8 | 0.620 | 0.08 | 1.9 | 0.068 | 28.0 | 38.0 | 0.99651 | 3.42 | 0.82 | 9.5 | 1593 |
| 1140 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 | 1594 |
| 1141 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 | 1595 |
| 1142 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 | 1597 |
1143 rows × 12 columns
# Display the target Series ('quality').
y
0 5
1 5
2 5
3 6
4 5
..
1138 6
1139 6
1140 5
1141 6
1142 5
Name: quality, Length: 1143, dtype: int64
# Define the number of folds for cross-validation.
k = 5
# Initialize KFold object; shuffle=True randomizes row order before splitting
# and random_state=42 makes the folds reproducible.
kf = KFold(n_splits=k, shuffle=True, random_state=42)
# Display the splitter configuration.
kf
KFold(n_splits=5, random_state=42, shuffle=True)
# Initialize lists to store the evaluation metrics (one MSE per fold).
mse_scores = []
# Display the (still empty) list of per-fold MSEs.
mse_scores
[]
# Iterate over each fold; KFold.split yields (train_index, test_index)
# positional row-index arrays for every fold.
# FIX: the loop body had lost its indentation in this export, which is a
# SyntaxError in Python — re-indent the split statements under the for.
# NOTE(review): as exported, only the train/test split happens inside the
# loop; the fit/predict/score cells below run AFTER the loop and therefore
# only ever see the LAST fold's split. Confirm against the original notebook.
for train_index, test_index in kf.split(X):
    # Split data into training and testing sets for this fold.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
# Display the last fold's train/test feature split.
X_train, X_test
( fixed acidity volatile acidity citric acid residual sugar chlorides \
0 7.4 0.70 0.00 1.9 0.076
2 7.8 0.76 0.04 2.3 0.092
3 11.2 0.28 0.56 1.9 0.075
4 7.4 0.70 0.00 1.9 0.076
5 7.4 0.66 0.00 1.8 0.075
... ... ... ... ... ...
1137 5.4 0.74 0.09 1.7 0.089
1138 6.3 0.51 0.13 2.3 0.076
1139 6.8 0.62 0.08 1.9 0.068
1140 6.2 0.60 0.08 2.0 0.090
1141 5.9 0.55 0.10 2.2 0.062
free sulfur dioxide total sulfur dioxide density pH sulphates \
0 11.0 34.0 0.99780 3.51 0.56
2 15.0 54.0 0.99700 3.26 0.65
3 17.0 60.0 0.99800 3.16 0.58
4 11.0 34.0 0.99780 3.51 0.56
5 13.0 40.0 0.99780 3.51 0.56
... ... ... ... ... ...
1137 16.0 26.0 0.99402 3.67 0.56
1138 29.0 40.0 0.99574 3.42 0.75
1139 28.0 38.0 0.99651 3.42 0.82
1140 32.0 44.0 0.99490 3.45 0.58
1141 39.0 51.0 0.99512 3.52 0.76
alcohol Id
0 9.4 0
2 9.8 2
3 9.8 3
4 9.4 4
5 9.4 5
... ... ...
1137 11.6 1591
1138 11.0 1592
1139 9.5 1593
1140 10.5 1594
1141 11.2 1595
[915 rows x 12 columns],
fixed acidity volatile acidity citric acid residual sugar chlorides \
1 7.8 0.880 0.00 2.6 0.098
13 7.9 0.320 0.51 1.8 0.341
14 7.6 0.390 0.31 2.3 0.082
20 7.1 0.710 0.00 1.9 0.080
21 7.8 0.645 0.00 2.0 0.082
... ... ... ... ... ...
1126 7.5 0.520 0.40 2.2 0.060
1127 8.0 0.300 0.63 1.6 0.081
1130 7.4 0.350 0.33 2.4 0.068
1135 5.8 0.610 0.11 1.8 0.066
1142 5.9 0.645 0.12 2.0 0.075
free sulfur dioxide total sulfur dioxide density pH sulphates \
1 25.0 67.0 0.99680 3.20 0.68
13 17.0 56.0 0.99690 3.04 1.08
14 23.0 71.0 0.99820 3.52 0.65
20 14.0 35.0 0.99720 3.47 0.55
21 8.0 16.0 0.99640 3.38 0.59
... ... ... ... ... ...
1126 12.0 20.0 0.99474 3.26 0.64
1127 16.0 29.0 0.99588 3.30 0.78
1130 9.0 26.0 0.99470 3.36 0.60
1135 18.0 28.0 0.99483 3.55 0.66
1142 32.0 44.0 0.99547 3.57 0.71
alcohol Id
1 9.8 1
13 9.2 19
14 9.7 21
20 9.4 28
21 9.8 29
... ... ...
1126 11.8 1575
1127 10.8 1576
1130 11.9 1580
1135 10.9 1587
1142 10.2 1597
[228 rows x 12 columns])
# Display the last fold's train/test target split.
y_train, y_test
(0 5
2 5
3 6
4 5
5 5
..
1137 6
1138 6
1139 6
1140 5
1141 6
Name: quality, Length: 915, dtype: int64,
1 5
13 6
14 5
20 5
21 6
..
1126 6
1127 6
1130 6
1135 6
1142 5
Name: quality, Length: 228, dtype: int64)
# Initialize and train the model.
# NOTE(review): this runs after the KFold loop, so it fits on the LAST fold's
# split only — confirm against the original notebook.
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
# Make predictions on the test set.
y_pred = model.predict(X_test)
# Display the predicted 'quality' values.
y_pred
array([5.16186431, 5.43479564, 5.4099847 , 5.04619825, 5.37704568,
5.7990418 , 5.38731241, 5.43093026, 5.35588733, 5.29421816,
6.63569715, 5.62679956, 5.00725958, 4.79458272, 6.24309063,
5.37429889, 6.97932141, 5.70767983, 5.47693236, 5.85157399,
5.08360332, 4.97385585, 5.227863 , 5.40778371, 5.6873337 ,
5.0492026 , 6.05221991, 6.02449667, 5.55668676, 4.9799088 ,
6.84811605, 5.61450834, 5.87182838, 5.4222998 , 5.78893723,
5.91544362, 5.69223483, 4.85279719, 5.2761029 , 5.5057901 ,
6.17136137, 6.26883163, 5.70079611, 5.70076378, 5.2563155 ,
6.22684128, 6.10755136, 6.84600024, 5.2940607 , 5.78932803,
5.74935871, 5.96072529, 5.17025127, 5.16986734, 5.17022067,
5.57877849, 5.96387974, 5.49275985, 6.63407454, 5.77748158,
5.39598662, 5.70716412, 6.55672665, 5.33760978, 5.77942747,
5.50962442, 6.12335577, 5.35827673, 6.10912862, 5.44154945,
5.88810051, 6.25332822, 6.00611659, 6.06743437, 6.40275454,
5.96543164, 5.17794207, 4.73384219, 6.05276512, 5.3526892 ,
4.85885224, 5.16134159, 5.29722373, 5.27316423, 6.02841554,
5.57243444, 5.44138746, 5.41448582, 5.75515053, 5.64434483,
5.16386656, 6.06867888, 4.77733117, 6.0163159 , 5.07399768,
5.59226689, 5.31554421, 4.7729377 , 5.16219757, 5.18661082,
5.55869181, 4.83932641, 5.15024736, 5.41985256, 5.49618322,
5.4961509 , 5.53854026, 6.47043781, 6.18676984, 5.38664136,
5.42814432, 5.9855998 , 6.1910727 , 6.42648999, 5.4435614 ,
4.77444262, 5.61816966, 5.58577652, 5.74073933, 5.80766136,
5.68411174, 6.33403541, 6.45014122, 5.17709068, 5.6430476 ,
6.29329723, 5.85728149, 6.5493569 , 5.80993138, 4.88902914,
6.36817438, 4.89424114, 5.84031068, 5.26049131, 5.4497633 ,
6.18365773, 5.91343057, 6.39614328, 5.016204 , 6.63169597,
5.64698016, 6.21235087, 6.11561583, 6.5747321 , 6.09800881,
6.33180473, 6.38375643, 6.48389748, 5.28843156, 6.73248582,
6.40585391, 6.69928591, 6.21667497, 6.3795492 , 5.57881334,
5.25549336, 5.24929646, 5.37188191, 6.3530185 , 6.04638406,
6.02085125, 5.38647511, 5.79402117, 6.0916635 , 5.12719094,
5.47277288, 6.37687869, 5.38888697, 6.18555351, 5.25150491,
6.20165538, 6.51934611, 5.38417076, 5.12845699, 5.36505494,
5.19289557, 5.28341534, 5.01096604, 5.15909701, 5.62492582,
5.62489349, 5.62479651, 5.67433823, 5.90380965, 5.53311896,
5.73321606, 4.98641665, 4.81652005, 5.13260822, 5.43416856,
5.28721649, 5.1511722 , 6.47609733, 6.92311286, 5.37636023,
5.97669435, 5.72890941, 5.44585044, 5.764088 , 5.4827883 ,
4.92087612, 5.34527458, 6.68606406, 5.38025643, 5.14729562,
5.57927144, 6.08723916, 5.04054407, 5.30379653, 6.56058845,
6.36308623, 5.68673533, 4.8270962 , 5.24619146, 5.53831323,
5.70246751, 5.20148276, 4.77529309, 5.13248835, 5.13245602,
5.35318881, 5.83844359, 6.07998464, 6.16051537, 6.09330422,
6.26019292, 5.65303901, 5.39817093])
# Calculate mean squared error for this fold's predictions.
mse = mean_squared_error(y_test, y_pred)
# Display the fold's MSE.
mse
0.3776269060991377
# Append the mse to the list of scores.
mse_scores.append(mse)
# Display the collected per-fold MSEs.
mse_scores
[0.3776269060991377]
# Calculate the average mean squared error across the collected fold scores.
# FIX: divide by the number of scores actually collected, not by k.
# mse_scores gains one entry per scored fold; here only ONE fold was scored
# (the fit/score steps ran after the loop), so sum(...)/k reported
# mse/5 = 0.0755 instead of the true mean of the collected scores, 0.3776.
avg_mse = sum(mse_scores) / len(mse_scores)
print("Average MSE:", avg_mse)
Average MSE: 0.07552538121982753
# Display the average MSE.
avg_mse
0.07552538121982753
import pandas as pd

# Load the dataset.
# BUG FIX: the filename is normalized to "WineQT.csv" to match the earlier
# cells — the lowercase "wineQT.csv" only worked on a case-insensitive
# filesystem and fails elsewhere.
wine_data = pd.read_csv("WineQT.csv")

# Check the column names in the dataset
print(wine_data.columns)

# This file contains red wines only, so no 'type' column exists (see the
# printed Index below); the encoding branch is kept in case a combined
# red/white dataset is substituted later.
# BUG FIX: the if/else body indentation (lost in the notebook export) is
# restored — as written, the block was an IndentationError.
if 'type' in wine_data.columns:
    wine_data_encoded = pd.get_dummies(wine_data, columns=['type'])
    # Display the first few rows of the encoded dataset
    print(wine_data_encoded.head())
else:
    print("Column 'type' not found in the dataset.")
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol', 'quality', 'Id'],
dtype='object')
Column 'type' not found in the dataset.
# Echo the DataFrame so the notebook renders the full table below
wine_data
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 0 |
| 1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 5 | 1 |
| 2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 5 | 2 |
| 3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 6 | 3 |
| 4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1138 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 6 | 1592 |
| 1139 | 6.8 | 0.620 | 0.08 | 1.9 | 0.068 | 28.0 | 38.0 | 0.99651 | 3.42 | 0.82 | 9.5 | 6 | 1593 |
| 1140 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 | 5 | 1594 |
| 1141 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 | 6 | 1595 |
| 1142 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 | 5 | 1597 |
1143 rows × 13 columns
# Performing predictions on the wine dataset: fit a linear regression on the
# physico-chemical features and report test-set mean squared error.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the dataset (filename case normalized to match the earlier cells)
wine_data = pd.read_csv("WineQT.csv")

# Separate features (X) and target variable (y).
# BUG FIX: 'Id' is a row identifier, not a chemical property — leaving it in
# X lets the model fit an arbitrary index, so drop it along with the target.
X = wine_data.drop(columns=['quality', 'Id'])
y = wine_data['quality']

# Split the data into training and testing sets (80% train, 20% test);
# random_state fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate mean squared error on the held-out test set
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
Mean Squared Error: 0.38242835212919646
# Fit the same linear regression and report the R-squared goodness of fit.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Load the dataset (filename case normalized to match the earlier cells)
wine_data = pd.read_csv("WineQT.csv")

# Separate features (X) and target variable (y).
# BUG FIX: drop the 'Id' row identifier as well — it is an arbitrary index,
# not a predictive feature, and inflates/distorts the fit.
X = wine_data.drop(columns=['quality', 'Id'])
y = wine_data['quality']

# Split the data into training and testing sets (80% train, 20% test);
# same random_state as the MSE cell so the split is identical
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate R-squared on the held-out test set
r_squared = r2_score(y_test, y_pred)
print("R-squared:", r_squared)
R-squared: 0.3127638539508196